Data loading

The file "~/Documents/Postdoc_Biogeco/1_NGB/data/ORPHEE_leaf_microbiota/data_Qi_Qr_16S_filtered_formatted.RDS" and "~/Documents/Postdoc_Biogeco/1_NGB/data/ORPHEE_leaf_microbiota/data_Qi_Qr_ITS_filtered_formatted.RDS" are list objects with five elements:

rm(list=ls())
# Load packages
library(vegan)
library(beanplot)
library(ape)
library(RColorBrewer)
library(knitr)
library(kableExtra)
library(ranger)
source("~/Documents/Postdoc_Biogeco/1_NGB/src/src_function_points_jitter.R")
source("~/Documents/Postdoc_Biogeco/2_DROUGHT/src/src_function_conv_hull_pcoa.R")
source("~/Documents/Postdoc_Biogeco/1_NGB/src/src_function_agg_table_taxo.R")
paf <- "~/Documents/Postdoc_Biogeco/1_NGB"
  
## Loading 16S data -------------
tmp <- readRDS("~/Documents/Postdoc_Biogeco/1_NGB/data/ORPHEE_leaf_microbiota/data_Qi_Qr_16S_filtered_formatted.RDS")
asv_table_16S <- tmp[["asv_table"]]
samples_16S <- tmp[["samples"]]
assign_16S <- tmp[["assign"]]
wp_16S <- tmp[["water_potential"]]
rm(tmp)

# Remove Qr samples from the dataset:
asv_table_16S <- asv_table_16S[,-grep("PU2", colnames(asv_table_16S))]
# Remove ASVs not present in samples:
asv_table_16S <- asv_table_16S[apply(asv_table_16S,1,sum)>0,]
# Remove assignation of ASVs not present in samples:
assign_16S <- assign_16S[match(rownames(asv_table_16S),
                                        rownames(assign_16S)),]
# Remove Qr samples from the metadata
samples_16S <- samples_16S[match(colnames(asv_table_16S),
                                          rownames(samples_16S)),]

# remove endophytes samples from the dataset: 
endo_16S <- subset(samples_16S, sample_type=="endo")
asv_table_endo_16S <- asv_table_16S[,match(rownames(endo_16S),
                                           dimnames(asv_table_16S)[[2]])]

samples_16S <- subset(samples_16S, sample_type=="micro")
asv_table_16S <- asv_table_16S[,match(rownames(samples_16S),
                                      dimnames(asv_table_16S)[[2]])]

## Loading ITS data -------------
tmp <- readRDS("~/Documents/Postdoc_Biogeco/1_NGB/data/ORPHEE_leaf_microbiota/data_Qi_Qr_ITS_filtered_formatted.RDS")
asv_table_ITS <- tmp[["asv_table"]]
samples_ITS <- tmp[["samples"]]
assign_ITS <- tmp[["assign"]]
wp_ITS <- tmp[["water_potential"]]
rm(tmp)

# Remove Qr samples from the dataset:
asv_table_ITS <- asv_table_ITS[,-grep("PU2", colnames(asv_table_ITS))]
# Remove ASVs not present in samples:
asv_table_ITS <- asv_table_ITS[apply(asv_table_ITS,1,sum)>0,]
# Remove assignation of ASVs not present in samples:
assign_ITS <- assign_ITS[match(rownames(asv_table_ITS),
                                        rownames(assign_ITS)),]
# Remove Qr samples from the metadata
samples_ITS <- samples_ITS[match(colnames(asv_table_ITS),
                                          rownames(samples_ITS)),]

# remove endophytes samples from the dataset: 
endo_ITS <- subset(samples_ITS, sample_type=="endo")
asv_table_endo_ITS <- asv_table_ITS[,match(rownames(endo_ITS),
                                           dimnames(asv_table_ITS)[[2]])]

samples_ITS <- subset(samples_ITS, sample_type=="micro")
asv_table_ITS <- asv_table_ITS[,match(rownames(samples_ITS),
                                      dimnames(asv_table_ITS)[[2]])]

# Colors 
source("~/Documents/Postdoc_Biogeco/1_NGB/src/project_colors_v2.R")

Sample metadata are loaded twice because samples that were successfully sequenced and that passed data curation are not exactly the same for 16S and ITS data. For now I don’t want to reduce the data sets to their intersection so it is more convenient to keep these duplicated data, but I could think later of a way to merge them.

Alpha diversity

Calculation of classical alpha diversity indices for each sample

samples_16S$shannon <- diversity(asv_table_16S, index = "shannon", MARGIN = 2)
samples_16S$chao1 <- estimateR(t(asv_table_16S))["S.chao1",]

samples_ITS$shannon <- diversity(asv_table_ITS, index = "shannon", MARGIN = 2)
samples_ITS$chao1 <- estimateR(t(asv_table_ITS))["S.chao1",]
par(mfrow=c(1,2), bty="l",las=1, mar=c(4,4.5,2,0))
beanplot(chao1~irrigation,
         data=samples_16S,
         log="",
         xlab="", col.axis="darkgray",
         what=c(0,1,1,0),
         cex.lab=1.5,
         cutmin=0,
         cex.axis=1,
         ylim=c(0,300),
         border="gray",
         #border=colors[c("Bp", "Pp", "Qi", "Qr")],
         #names=c("BRAN", "CDS1","CDS2","CDS3", "MTF1", "MTF2","MTF3"),
         ylab = "Shannon index",
         col=as.list("white"),
         main="Bacteria")
## new function
pt.jitter(samples_16S$irrigation, samples_16S$chao1, 
          bg = hue_irr[samples_16S$irrigation], alpha = 0.6)

beanplot(chao1~irrigation,
         data=samples_ITS,
         log="",
         xlab="", col.axis="darkgray",
         what=c(0,1,1,0),
         cex.lab=1.5,
         cutmin=0,
         cex.axis=1,
         ylim=c(0,300),
         border="gray",
         #border=colors[c("Bp", "Pp", "Qi", "Qr")],
         #names=c("BRAN", "CDS1","CDS2","CDS3", "MTF1", "MTF2","MTF3"),
         ylab = "Shannon index",
         col=as.list("white"),
         main="Fungi")
## new function
pt.jitter(samples_ITS$irrigation, samples_ITS$chao1, 
          bg = hue_irr[samples_ITS$irrigation], alpha = 0.6)

par(mfrow=c(1,2), bty="l",las=1, mar=c(4,4.5,2,0))
beanplot(shannon~irrigation,
         data=samples_16S,
         log="",
         xlab="", col.axis="darkgray",
         what=c(0,1,1,0),
         cex.lab=1.5,
         cutmin=0,
         cex.axis=1,
         ylim=c(0,6),
         border="gray",
         #border=colors[c("Bp", "Pp", "Qi", "Qr")],
         #names=c("BRAN", "CDS1","CDS2","CDS3", "MTF1", "MTF2","MTF3"),
         ylab = "Shannon index",
         col=as.list("white"),
         main="Bacteria")
## new function
pt.jitter(samples_16S$irrigation, samples_16S$shannon, 
          bg = hue_irr[samples_16S$irrigation], alpha = 0.6)

beanplot(shannon~irrigation,
         data=samples_ITS,
         log="",
         xlab="", col.axis="darkgray",
         what=c(0,1,1,0),
         cex.lab=1.5,
         cutmin=0,
         cex.axis=1,
         ylim=c(0,6),
         border="gray",
         #border=colors[c("Bp", "Pp", "Qi", "Qr")],
         #names=c("BRAN", "CDS1","CDS2","CDS3", "MTF1", "MTF2","MTF3"),
         ylab = "Shannon index",
         col=as.list("white"),
         main="Fungi")
## new function
pt.jitter(samples_ITS$irrigation, samples_ITS$shannon, 
          bg = hue_irr[samples_ITS$irrigation], alpha = 0.6)

par(mfrow=c(1,2), bty="l",las=1, mar=c(4,4.5,2,0),
    cex.lab=1.5, col.axis="darkgray")
plot(chao1~seq_depth, 
     col="gray", 
     pch=21,
     bg=adjustcolor("gray", alpha.f = 0.6),
     main="Bacteria",
     data=samples_16S)
plot(chao1~seq_depth, 
     col="gray", 
     pch=21,
     bg=adjustcolor("gray", alpha.f = 0.6),
     main="Fungi",
     data=samples_ITS)

plot(shannon~seq_depth, 
     col="gray", 
     pch=21,
     bg=adjustcolor("gray", alpha.f = 0.6),
     main="Bacteria",
     data=samples_16S)
plot(shannon~seq_depth, 
     col="gray", 
     pch=21,
     bg=adjustcolor("gray", alpha.f = 0.6),
     main="Fungi",
     data=samples_ITS)

Community composition

Complete PERMANOVA

PERMANOVA analysis based on Bray-Curtis distance.

Bacteria

z <- samples_16S
tab <- asv_table_16S

(p <- adonis2(t(tab)~seq_depth+irrigation/block*month*leaf_age*psi_predawn_mean_sampling, 
              permutations = 999, method="bray",
              data=z))
saveRDS(p, "PERMANOVA_Qi_tot_16S.RDS")
cat(
kable(p[,c(1,3,5)], digits = c(0,2,3),"latex", booktabs = T)%>%
  row_spec(c(1:6), bold = T, color = "black"),
    file ="tab_PERMANOVA_Qi_tot_16S.txt"
)
readRDS("PERMANOVA_Qi_tot_16S.RDS")
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = t(tab) ~ seq_depth + irrigation/block * month * leaf_age * psi_predawn_mean_sampling, data = z, permutations = 999, method = "bray")
##                                       Df SumOfSqs      R2       F Pr(>F)    
## seq_depth                              1    4.521 0.06200 17.1597  0.001 ***
## irrigation                             1    1.964 0.02694  7.4551  0.001 ***
## month                                  1    1.322 0.01813  5.0172  0.001 ***
## leaf_age                               1    4.118 0.05647 15.6286  0.001 ***
## psi_predawn_mean_sampling              1    0.637 0.00874  2.4177  0.005 ** 
## irrigation:block                       6    2.543 0.03487  1.6085  0.001 ***
## irrigation:month                       1    0.384 0.00526  1.4564  0.080 .  
## irrigation:leaf_age                    1    0.402 0.00551  1.5254  0.055 .  
## irrigation:psi_predawn_mean_sampling   1    0.265 0.00363  1.0044  0.424    
## month:psi_predawn_mean_sampling        1    0.208 0.00286  0.7909  0.727    
## leaf_age:psi_predawn_mean_sampling     1    0.306 0.00419  1.1610  0.227    
## irrigation:month:block                 1    0.176 0.00242  0.6691  0.894    
## irrigation:leaf_age:block              3    0.743 0.01019  0.9398  0.585    
## Residual                             210   55.333 0.75879                   
## Total                                230   72.923 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Irrigation treatment explains 3% of the variance in bacterial community composition
  • There is a significant effect of predawn water potential, but as we only have one value of wp per block, we need to disentangle the block and the irrigation effect. We should probably replace the block variable per a pcnm of the trees coordinates.

Fungi

z <- samples_ITS
tab <- asv_table_ITS

(p <- adonis2(t(tab)~seq_depth+irrigation/block*month*leaf_age*psi_predawn_mean_sampling,
              permutations = 999, method="bray",
              data=z))
saveRDS(p, "PERMANOVA_Qi_tot_ITS.RDS")
cat(
kable(p[,c(1,3,5)], digits = c(0,2,3),"latex", booktabs = T)%>%
  row_spec(c(1:8, 11:13), bold = T, color = "black"),
    file ="tab_PERMANOVA_Qi_tot_ITS.txt"
)
readRDS("PERMANOVA_Qi_tot_ITS.RDS")
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = t(tab) ~ seq_depth + irrigation/block * month * leaf_age * psi_predawn_mean_sampling, data = z, permutations = 999, method = "bray")
##                                       Df SumOfSqs      R2       F Pr(>F)    
## seq_depth                              1    1.960 0.02510  7.6098  0.001 ***
## irrigation                             1    1.619 0.02073  6.2857  0.001 ***
## month                                  1    3.086 0.03951 11.9790  0.001 ***
## leaf_age                               1    7.268 0.09305 28.2124  0.001 ***
## psi_predawn_mean_sampling              1    0.670 0.00858  2.6010  0.002 ** 
## irrigation:block                       6    5.121 0.06556  3.3129  0.001 ***
## irrigation:month                       1    0.443 0.00567  1.7193  0.039 *  
## irrigation:leaf_age                    1    0.635 0.00813  2.4663  0.006 ** 
## irrigation:psi_predawn_mean_sampling   1    0.259 0.00332  1.0063  0.418    
## month:psi_predawn_mean_sampling        1    0.278 0.00356  1.0780  0.320    
## leaf_age:psi_predawn_mean_sampling     1    0.644 0.00825  2.5000  0.006 ** 
## irrigation:month:block                 1    0.485 0.00621  1.8839  0.020 *  
## irrigation:leaf_age:block              3    1.282 0.01641  1.6585  0.008 ** 
## Residual                             211   54.356 0.69592                   
## Total                                231   78.107 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Irrigation treatment explains 3% of the variance in fungal community composition
  • There is a significant effect of predawn water potential, but as we only have one value of wp per block, we need to disentangle the block and the irrigation effect. We should probably replace the block variable per a pcnm of the trees coordinates.
  • It looks that there are more spatial effects in fungi than in bacteria

Visualization of the irrigation treatment effect

Clr transformation of the data

Due to the compositional nature of microbiota data and differences in sequencing depth between species, it is necessary to normalize data. I perform here a centered log-ratio (clr) transformation after modeling low count abundances with Monte Carlo replicates of the Dirichlet distribution (see Fernandes et al. (2014) Microbiome). The clr transformation shifts discrete data to continuous data.

Here 128 Monte Carlo samples of the Dirichlet distribution were made for each samples. The 128 matrices obtained were then averaged, following C. Pauvert’s thesis.

library(ALDEx2)

asv.clr <- aldex.clr(asv_table_16S, verbose=T,
                     useMC = F, denom = "all", mc.samples = 128)
asv.clr.mean<-sapply(getMonteCarloInstances(asv.clr),rowMeans) # Average over the different instances
saveRDS(asv.clr.mean,"aldex_clr_mean_16S.RDS") # Write to disk

asv.clr <- aldex.clr(asv_table_ITS, verbose=T,
                     useMC = F, denom = "all", mc.samples = 128)
asv.clr.mean<-sapply(getMonteCarloInstances(asv.clr),rowMeans) # Average over the different instances
saveRDS(asv.clr.mean,"aldex_clr_mean_ITS.RDS") # Write to disk

Bacteria

par(mfrow=c(1,2))
z <- samples_16S
tab <- asv_table_16S

dist <- vegdist(decostand(t(tab),method="hell"), 
                  method = "bray", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_irr[z$irrigation], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_irr,
       col=hue_irr,
       legend=names(hue_irr))

conv.hull.pcoa(pp, groups = z$irrigation, colors=hue_irr)
# dev.off()

z <- samples_16S
tab <- readRDS("aldex_clr_mean_16S.RDS")

dist <- vegdist(t(tab), method = "euclidian", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_irr[z$irrigation], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Clr, Euclidian")
legend("bottomleft", pch=21,
       pt.bg = hue_irr,
       col=hue_irr,
       legend=names(hue_irr))

conv.hull.pcoa(pp, groups = z$irrigation, colors=hue_irr)

# dev.off()
par(mfrow=c(1,2))
z <- samples_16S
tab <- asv_table_16S

dist <- vegdist(decostand(t(tab),method="hell"), 
                  method = "bray", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_age[z$leaf_age], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_age,
       col=hue_leaf_age,
       legend=names(hue_leaf_age))

conv.hull.pcoa(pp, groups = z$leaf_age, colors=hue_leaf_age)
# dev.off()

z <- samples_16S
tab <- readRDS("aldex_clr_mean_16S.RDS")

dist <- vegdist(t(tab), method = "euclidian", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_age[z$leaf_age], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_age,
       col=hue_leaf_age,
       legend=names(hue_leaf_age))

conv.hull.pcoa(pp, groups = z$leaf_age, colors=hue_leaf_age)

# dev.off()
par(mfrow=c(1,2))
z <- samples_16S
tab <- asv_table_16S

dist <- vegdist(decostand(t(tab),method="hell"), 
                  method = "bray", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_type[z$leaf_type], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_type,
       col=hue_leaf_type,
       legend=names(hue_leaf_type))

#conv.hull.pcoa(pp, groups = z$leaf_type, colors=hue_leaf_type)
# dev.off()

z <- samples_16S
tab <- readRDS("aldex_clr_mean_16S.RDS")

dist <- vegdist(t(tab), method = "euclidian", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_type[z$leaf_type], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_type,
       col=hue_leaf_type,
       legend=names(hue_leaf_type))

#conv.hull.pcoa(pp, groups = z$leaf_type, colors=hue_leaf_type)
# dev.off()

Fungi

par(mfrow=c(1,2))
z <- samples_ITS
tab <- asv_table_ITS

dist <- vegdist(decostand(t(tab),method="hell"), 
                  method = "bray", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_irr[z$irrigation], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_ITS.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_irr,
       col=hue_irr,
       legend=names(hue_irr))

conv.hull.pcoa(pp, groups = z$irrigation, colors=hue_irr)
# dev.off()

z <- samples_ITS
tab <- readRDS("aldex_clr_mean_ITS.RDS")

dist <- vegdist(t(tab), method = "euclidian", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_irr[z$irrigation], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_ITS.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Clr, Euclidian")
legend("bottomleft", pch=21,
       pt.bg = hue_irr,
       col=hue_irr,
       legend=names(hue_irr))

conv.hull.pcoa(pp, groups = z$irrigation, colors=hue_irr)

# dev.off()
par(mfrow=c(1,2))
z <- samples_ITS
tab <- asv_table_ITS

dist <- vegdist(decostand(t(tab),method="hell"), 
                  method = "bray", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_age[z$leaf_age], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_age,
       col=hue_leaf_age,
       legend=names(hue_leaf_age))

conv.hull.pcoa(pp, groups = z$leaf_age, colors=hue_leaf_age)
# dev.off()

z <- samples_ITS
tab <- readRDS("aldex_clr_mean_ITS.RDS")

dist <- vegdist(t(tab), method = "euclidian", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_age[z$leaf_age], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_age,
       col=hue_leaf_age,
       legend=names(hue_leaf_age))

conv.hull.pcoa(pp, groups = z$leaf_age, colors=hue_leaf_age)

# dev.off()
par(mfrow=c(1,2))
z <- samples_ITS
tab <- asv_table_ITS

dist <- vegdist(decostand(t(tab),method="hell"), 
                  method = "bray", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_type[z$leaf_type], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_type,
       col=hue_leaf_type,
       legend=names(hue_leaf_type))

conv.hull.pcoa(pp, groups = z$leaf_type, colors=hue_leaf_type)
# dev.off()

z <- samples_ITS
tab <- readRDS("aldex_clr_mean_ITS.RDS")

dist <- vegdist(t(tab), method = "euclidian", binary=F)

pp <- pcoa(dist)
coul <- adjustcolor(hue_leaf_type[z$leaf_type], alpha.f = 0.6)

# pdf(file.path(paf, "results/figures/fig_PCoA_hell_bray_Paracou_organ_16S.pdf"), 5, 5)
par(bty="l",las=1, mar=c(4,4,2,1))
plot(pp$vectors[,1], pp$vectors[,2], col=coul,
     bg=coul, 
     pch=21, cex=1.5,
     col.axis="grey50",
     cex.lab=1,
     cex.axis=1,
     xlab=paste("PCoA axis 1 (",round(pp$values[1,"Relative_eig"],digits = 2)*100," %)", sep=""),
     ylab = paste("PCoA axis 2 (",round(pp$values[2,"Relative_eig"],digits = 2)*100," %)", sep=""),
     main="Hellinger, Bray-Curtis")
legend("bottomleft", pch=21,
       pt.bg = hue_leaf_type,
       col=hue_leaf_type,
       legend=names(hue_leaf_type))

conv.hull.pcoa(pp, groups = z$leaf_type, colors=hue_leaf_type)

# dev.off()

Random forest classification

Here, I try to predict tree irrigation treatment from their leaf microbial communities, using random forest. I will try several mtry parameters and taxonomic levels, following Chang et al. (2017) DOI:10.3389/fmicb.2017.00519

source("src_function_rf_mtry_taxo.R")
# 16S
z <- samples_16S
tab <- asv_table_16S
taxo <- assign_16S

res_tot <- rf.mtry.taxo(tab, taxo,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_mtry_20_taxo_16S.RDS")

res_tot <- rf.mtry.taxo(tab, taxo, n_fold = 3,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_mtry_20_nfold_3_taxo_16S.RDS")

tab <- asv_table_16S[apply(asv_table_16S, 1, sum)>0.001*sum(asv_table_16S),]
taxo <- assign_16S[rownames(tab),]
res_tot <- rf.mtry.taxo(tab, taxo,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_0001cutoff_mtry_20_nfold_5_taxo_16S.RDS")


#ITS
z <- samples_ITS
tab <- asv_table_ITS
taxo <- assign_ITS

res_tot <- rf.mtry.taxo(tab, taxo, 
                        treat = z$irrigation, 
                        n_mtry = 20)
saveRDS(res_tot, "rf_irr_mtry_20_taxo_ITS.RDS")

res_tot <- rf.mtry.taxo(tab, taxo, n_fold = 3,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_mtry_20_nfold_3_taxo_ITS.RDS")

tab <- asv_table_ITS[apply(asv_table_ITS, 1, sum)>0.001*sum(asv_table_ITS),]
taxo <- assign_ITS[rownames(tab),]
res_tot <- rf.mtry.taxo(tab, taxo,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_0001cutoff_mtry_20_nfold_5_taxo_ITS.RDS")

# Both

foo <- c(colnames(asv_table_16S), colnames(asv_table_ITS))
foo <- foo[duplicated(foo)]
tab <- rbind(asv_table_16S[,foo], asv_table_ITS[,foo])
taxo <- rbind(assign_16S, assign_ITS)
z <- samples_16S[match(foo, rownames(samples_16S)),]

res_tot <- rf.mtry.taxo(tab, taxo, 
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_mtry_20_taxo_16S_ITS.RDS")

res_tot <- rf.mtry.taxo(tab, taxo, n_fold = 3,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_mtry_20_nfold_3_taxo_16S_ITS.RDS")

p <- apply(tab, 1, sum)>0.001*sum(tab)
tab <- tab[p,]
taxo <- taxo[p,]
res_tot <- rf.mtry.taxo(tab, taxo,
                        treat = z$irrigation, 
                        n_mtry = 20)

saveRDS(res_tot, "rf_irr_0001cutoff_mtry_20_nfold_5_taxo_16S_ITS.RDS")

Here is a graph that shows the precision and specificity of predictions obtained with several sizes of the dataset and taxonomic level. Colored dots represent the predictions obtained with the mtry that gave the lowest error rate. Other mtry values are in gray.

foo <- list.files(pattern = "rf_irr.*.RDS")

par(mfrow=c(1,1), bty="l",las=1, mar=c(4,4,2,1), col.axis="gray", cex.lab=1.5)
plot(c(0.6,1), c(0.6,1), type="n", xlab="Precision", ylab="Sensitivity")
for (j in 1:length(foo)) {
  d <- readRDS(foo[j])
  for (i in 1:length(d)) {
    points(sensitivity~precision, 
           pch=c(1:length(foo))[j], col="lightgray",
           data=d[[i]][-which.min(d[[i]]$err_mean),])
  }
}
for (j in 1:length(foo)) {
  d <- readRDS(foo[j])
  for (i in 1:length(d)) {
    points(sensitivity~precision, cex=2, 
           pch=c(1:length(foo))[j], col=c(1:length(d))[i],
           data=d[[i]][which.min(d[[i]]$err_mean),])
  }
}
legend("topleft", pch=1:length(foo), col=1,bty = "n",
       legend=gsub("rf_irr_(.*)\\.RDS","\\1",foo),
       cex = 0.7)

legend("left", pch=1, col=1:length(d),bty = "n",
       legend=names(d),
       cex = 0.7)

# plot(res_mean~mtry, type='n', ylim=c(0,1), #xlim=c(0,200),
#      data=rf_16S[["ASV"]])
# lapply(rf_16S, function(x){
#   lines(res_mean~mtry, ylim=c(0,1), type="b", data=x)
#   # segments(y0=x[,"res_mean"]+x[,"res_sd"], 
#   #          x0=x[,"mtry"], 
#   #          y1=x[,"res_mean"]-x[,"res_sd"])
#   polygon(c(x[,"mtry"], rev(x[,"mtry"])),
#           c(x[,"res_mean"]+x[,"res_sd"],
#             rev(x[,"res_mean"]-x[,"res_sd"])),
#           col=adjustcolor("gray", alpha.f = 0.5),
#           border=adjustcolor("gray", alpha.f = 0.5)
#   )
# })

For the ASV level which have the best predictive power, 3 fold or 5 fold cross validations seemed to lead to the same results. I will thus remove 3-fold cross validation to improve readability.

foo <- list.files(pattern = "rf_irr.*.RDS")
foo <- foo[-grep("nfold_3", foo)]

par(mfrow=c(1,1), bty="l",las=1, mar=c(4,4,2,1), col.axis="gray", cex.lab=1.5)
plot(c(0.6,1), c(0.6,1), type="n", xlab="Precision", ylab="Sensitivity")
res_err_rate <- NULL
for (j in 1:length(foo)) {
  d <- readRDS(foo[j])
  for (i in 1:length(d)) {
    points(sensitivity~precision, 
           pch=c(0,1,2,15,16,17)[j], col="lightgray",
           data=d[[i]][-which.min(d[[i]]$err_mean),])
    res_err_rate <- c(res_err_rate, min(d[[i]]$err_mean))
  }
}
for (j in 1:length(foo)) {
  d <- readRDS(foo[j])
  for (i in 1:length(d)) {
    points(sensitivity~precision, cex=2, 
           pch=c(0,1,2,15,16,17)[j], col=c(1:length(d))[i],
           data=d[[i]][which.min(d[[i]]$err_mean),])
  }
}
legend("topleft", pch=c(0,1,2,15,16,17), col=1,bty = "n",
       legend=gsub("rf_irr_(.*)\\.RDS","\\1",foo),
       cex = 0.7)

legend("left", pch=1, col=1:length(d),bty = "n",
       legend=names(d),
       cex = 0.7)

In the best case, the error rate is 5 %.

Conclusions for random forest analysis:

The best prediction is obtained by

  • trying several mtry values
  • combining both 16S and ITS data
  • at the ASV level (no need for a clean reference database!)
  • including or not rare ASVs (no need for deep sequencing depth)

Session info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.7.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.7.0
## 
## locale:
##  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
##  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
##  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ranger_0.12.1      kableExtra_1.1.0   knitr_1.28         RColorBrewer_1.1-2
## [5] ape_5.3            beanplot_1.2       vegan_2.5-6        lattice_0.20-38   
## [9] permute_0.9-5     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3        compiler_3.6.1    pillar_1.4.3      tools_3.6.1      
##  [5] digest_0.6.24     viridisLite_0.3.0 lifecycle_0.1.0   evaluate_0.14    
##  [9] tibble_2.1.3      nlme_3.1-140      mgcv_1.8-28       pkgconfig_2.0.3  
## [13] rlang_0.4.4       Matrix_1.2-17     rstudioapi_0.11   yaml_2.2.1       
## [17] parallel_3.6.1    xfun_0.12         xml2_1.2.2        stringr_1.4.0    
## [21] httr_1.4.1        cluster_2.1.0     vctrs_0.2.2       hms_0.5.3        
## [25] webshot_0.5.2     grid_3.6.1        glue_1.3.1        R6_2.4.1         
## [29] rmarkdown_2.1     readr_1.3.1       magrittr_1.5      scales_1.1.0     
## [33] htmltools_0.4.0   MASS_7.3-51.4     splines_3.6.1     rvest_0.3.5      
## [37] colorspace_1.4-1  stringi_1.4.5     munsell_0.5.0     crayon_1.3.4